This tutorial demonstrates how to use the FeaSc software for multi-sample pathway activity inference analysis. FeaSc is a Python package specifically designed for pathway activity analysis in single-cell RNA sequencing data, supporting multiple dimensionality reduction methods and batch effect correction. When analyzing multiple samples with batch effects, there are two main approaches: (1) ignore batch effects and perform pathway activity inference as done for single samples by integrating results from classical dimensionality reduction methods such as PCA, NMF and MCA, or (2) use scVI as a preprocessing step for batch effect correction.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import scipy
import scvi
from numpy import unique
import FeaSc as fsc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import roc_curve, auc
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [6, 4]
plt.rcParams['figure.dpi'] = 100def plot_roc_curve(labels, scores, target_celltype, method_name="Method",
figsize=(6, 5), show_plot=True):
y_true = (labels == target_celltype).astype(int)
y_score = scores
fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=figsize)
plt.plot(fpr, tpr, linewidth=2,
label=f'{method_name} (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', alpha=0.7)
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title(f'ROC Curve for {target_celltype} Detection\n({method_name})',
fontsize=14)
plt.legend(loc='lower right', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# 返回结果
return {
'fpr': fpr,
'tpr': tpr,
'auc': roc_auc,
'thresholds': thresholds
}In this section, we load and preprocess single-cell RNA sequencing data from the GSE96583 dataset, which contains immune cells under two conditions: with and without interferon stimulation.
adata = sc.read_h5ad('data/h5ad/GSE96583_IFN.h5ad')
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="seurat_v3")
adata = adata[:, adata.var.highly_variable]
adata.layers['counts'] = adata.X.copy()
adataAnnData object with n_obs × n_vars = 28871 × 3000
obs: 'tsne1', 'tsne2', 'ind', 'stim', 'cluster', 'cell', 'multiplets', 'n_genes'
var: 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
uns: 'hvg'
layers: 'counts'
adata.obs| tsne1 | tsne2 | ind | stim | cluster | cell | multiplets | n_genes | |
|---|---|---|---|---|---|---|---|---|
| cell_id | ||||||||
| AAACATACAATGCC-1 | -4.277833 | -19.294709 | 107 | ctrl | 5 | CD4 T cells | doublet | 852 |
| AAACATACATTTCC-1 | -27.640373 | 14.966629 | 1016 | ctrl | 9 | CD14+ Monocytes | singlet | 878 |
| AAACATACCAGAAA-1 | -27.493646 | 28.924885 | 1256 | ctrl | 9 | CD14+ Monocytes | singlet | 713 |
| AAACATACCAGCTA-1 | -28.132584 | 24.925484 | 1256 | ctrl | 9 | CD14+ Monocytes | doublet | 950 |
| AAACATACCATGCA-1 | -10.468194 | -5.984389 | 1488 | ctrl | 3 | CD4 T cells | singlet | 337 |
| … | … | … | … | … | … | … | … | … |
| TTTGCATGCTAAGC-1 | 25.142392 | 6.603815 | 107 | stim | 6 | CD4 T cells | singlet | 523 |
| TTTGCATGGGACGA-1 | 14.359657 | 10.965601 | 1488 | stim | 6 | CD4 T cells | singlet | 503 |
| TTTGCATGGTGAGG-1 | 27.317997 | 7.933458 | 1488 | stim | 6 | CD4 T cells | ambs | 448 |
| TTTGCATGGTTTGG-1 | 13.744084 | 9.347784 | 1244 | stim | 6 | CD4 T cells | ambs | 422 |
| TTTGCATGTCTTAC-1 | 14.572118 | -4.713942 | 1016 | stim | 5 | CD4 T cells | singlet | 421 |
28871 rows × 8 columns
Here we use scVI for batch correction. Running scVI on GPU will be much faster.
scvi.model.SCVI.setup_anndata(adata = adata, layer="counts",batch_key="stim")
model = scvi.model.SCVI(adata, n_latent=20)
model.train(max_epochs=400, early_stopping=True, early_stopping_patience=20)GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Training: 0%| | 0/400 [00:00<?, ?it/s]
`Trainer.fit` stopped: `max_epochs=400` reached.
reconstructed = model.get_normalized_expression(n_samples=10, return_mean=True,library_size="latent")reconstructed.iloc[:, :10]| GeneSymbol | HES4 | ISG15 | TNFRSF18 | TNFRSF4 | MXRA8 | ANKRD65 | NADK | PRKCZ | FAM213B | RP11-46F15.2 |
|---|---|---|---|---|---|---|---|---|---|---|
| cell_id | ||||||||||
| AAACATACAATGCC-1 | 0.019033 | 0.210214 | 0.008682 | 0.012635 | 0.008505 | 1.966600e-07 | 0.007852 | 0.016129 | 0.056992 | 0.001184 |
| AAACATACATTTCC-1 | 0.124047 | 1.775051 | 0.130992 | 0.141307 | 0.000300 | 5.500610e-03 | 0.187209 | 0.006264 | 0.024018 | 0.003484 |
| AAACATACCAGAAA-1 | 0.039507 | 0.192389 | 0.193688 | 0.164865 | 0.001958 | 2.018716e-04 | 0.075346 | 0.010887 | 0.006039 | 0.000531 |
| AAACATACCAGCTA-1 | 0.125339 | 1.100067 | 0.320633 | 0.149594 | 0.000137 | 4.099355e-05 | 0.098459 | 0.040244 | 0.003708 | 0.000075 |
| AAACATACCATGCA-1 | 0.031996 | 0.497427 | 0.013899 | 0.041660 | 0.000922 | 3.332141e-06 | 0.008444 | 0.008473 | 0.000527 | 0.001109 |
| … | … | … | … | … | … | … | … | … | … | … |
| TTTGCATGCTAAGC-1 | 0.022001 | 6.526540 | 0.022284 | 0.072436 | 0.000079 | 8.381066e-06 | 0.009712 | 0.011650 | 0.005869 | 0.000226 |
| TTTGCATGGGACGA-1 | 0.007349 | 5.203463 | 0.011646 | 0.037932 | 0.000172 | 8.764559e-07 | 0.016608 | 0.011724 | 0.011683 | 0.001016 |
| TTTGCATGGTGAGG-1 | 0.018921 | 4.360673 | 0.008313 | 0.025147 | 0.000127 | 3.003887e-06 | 0.004238 | 0.021204 | 0.007711 | 0.000681 |
| TTTGCATGGTTTGG-1 | 0.020848 | 5.322153 | 0.016481 | 0.020105 | 0.000611 | 5.446282e-04 | 0.019620 | 0.004102 | 0.004913 | 0.001470 |
| TTTGCATGTCTTAC-1 | 0.048356 | 5.451257 | 0.093828 | 0.541265 | 0.000064 | 7.868779e-06 | 0.010495 | 0.018504 | 0.020468 | 0.002103 |
28871 rows × 10 columns
adata.X = reconstructed.valueslatent = model.get_latent_representation()
adata.obsm['X_scVI'] = latent
sc.pp.neighbors(adata, use_rep='X_scVI', n_neighbors=15)
sc.tl.umap(adata, min_dist=0.3)sc.pl.umap(adata, color=['stim'])sc.pl.umap(adata, color=['cell'])path_bg = 'data/c2.cp.v2025.1.Hs.symbols.gmt'
path_fg = 'data/B-cell_marker.txt'
gs = pd.read_table(path_fg, header=0).iloc[:, 0].tolist()
gsList = {'gs': gs}
bgList = fsc.read_gmt(path_bg)
grate = fsc.build_gene_rate(bg_list=bgList, gs_list=gsList)
grate| geneset | background | |
|---|---|---|
| MUC16 | 0.0 | 0.002982 |
| SMYD1 | 0.0 | 0.000746 |
| ZNF14 | 0.0 | 0.000497 |
| DLGAP4 | 0.0 | 0.001243 |
| ZZZ3 | 0.0 | 0.001243 |
| … | … | … |
| UCK2 | 0.0 | 0.001740 |
| ENTPD1 | 0.0 | 0.002485 |
| RDH16 | 0.0 | 0.001740 |
| USP48 | 0.0 | 0.000746 |
| SLCO1B1 | 0.0 | 0.005716 |
13871 rows × 2 columns
adata = fsc.run_reduction(adata, n_dim=15, method="mca")run mca...
mca step 1: Construct the Fuzzy Matrix and Row Weights
mca step 1 completed: Fuzzy Matrix and Row Weights constructed
svd started
svd completed
mca step 2: Calculate Cell Coordinates and Feature Loadings
mca step 2 completed: Cell Coordinates and Feature Loadings calculated
mca_scores = fsc.calculate_activity(adata, grate, method="mca", n_dim=12)celltype_labels = adata.obs['cell'].copy()
result = plot_roc_curve(
labels=celltype_labels,
scores=mca_scores,
target_celltype="B cells",
method_name="MCA-based"
)We continue to score the IFN pathway and test whether the scores are higher in stim condition compared to ctrl.
path_bg = 'data/c2.cp.v2025.1.Hs.symbols.gmt'
path_fg = 'data/IFN/INF_SIG.txt'
gs = pd.read_table(path_fg, header=0).iloc[:, 0].tolist()
gsList = {'gs': gs}
bgList = fsc.read_gmt(path_bg)
grate = fsc.build_gene_rate(bg_list=bgList, gs_list=gsList)
grate| geneset | background | |
|---|---|---|
| MUC16 | 0.0 | 0.002982 |
| SMYD1 | 0.0 | 0.000746 |
| ZNF14 | 0.0 | 0.000497 |
| DLGAP4 | 0.0 | 0.001243 |
| ZZZ3 | 0.0 | 0.001243 |
| … | … | … |
| UCK2 | 0.0 | 0.001740 |
| ENTPD1 | 0.0 | 0.002485 |
| RDH16 | 0.0 | 0.001740 |
| USP48 | 0.0 | 0.000746 |
| SLCO1B1 | 0.0 | 0.005716 |
13868 rows × 2 columns
mca_scores = fsc.calculate_activity(adata, grate, method="mca",n_dim=12)celltype_labels = adata.obs['stim'].copy()
result = plot_roc_curve(
labels=celltype_labels,
scores=mca_scores,
target_celltype="stim",
method_name="MCA-based"
)